# coding: utf-8
'''
Created on 18 June 2013

@author: Nils Amiet
'''

import nltk

# Characters stripped from a tweet before it is split into words.
_STRIPPED_CHARS = frozenset("#.:,;'(){}[]~\"?!")

# Lower-cased NLTK word list; built on first use and reused by later calls.
_englishWordsCache = None


def _getEnglishWords():
    '''
    Returns the lower-cased NLTK English word list as a set, building it only once.
    '''
    global _englishWordsCache
    if _englishWordsCache is None:
        # Building this set walks the whole corpus, so do it a single time
        # instead of on every classified tweet.
        _englishWordsCache = frozenset(w.lower() for w in nltk.corpus.words.words())
    return _englishWordsCache


def isEnglishTweet(text, threshold=0.5):
    '''
    Checks that the ratio of unknown words in the given text does not exceed a threshold.
    Words are checked against an English dictionary of 235k words provided by NLTK
    (NOTE: presumably requires the NLTK "words" corpus to be installed - confirm in deployment).

    Before counting, punctuation signs are removed and the tokens "RT", links
    (starting with "http") and mentions (starting with "@") are discarded.
    The comparison is case-insensitive and each distinct word is counted once.

    @param text: the tweet text to classify
    @param threshold: highest accepted fraction of unknown words (defaults to 0.5)
    @return: True when the fraction of unknown words is at most the threshold,
             False otherwise or when the text contains no word at all
    '''
    cleaned = "".join(char for char in text if char not in _STRIPPED_CHARS)

    # split() without argument drops the empty tokens produced by repeated
    # spaces or removed punctuation; they would otherwise count as an unknown word.
    # "RT" is dropped as a whole token only, so words that merely contain
    # "RT" (e.g. "SMART") are left intact.
    words = [word for word in cleaned.split()
             if word != "RT" and not word.startswith(("http", "@"))]

    textWords = set(word.lower() for word in words)
    if not textWords:
        # Nothing left to judge (empty text, or only links/mentions):
        # avoid dividing by zero and do not claim the text is English.
        return False

    unknownWords = textWords - _getEnglishWords()
    unknownFraction = len(unknownWords) / float(len(textWords))

    return unknownFraction <= threshold